Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(apeglm)
library(ashr)

Setting AnnotationHub

AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Bring annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))      

if (length(ahQuery) == 1) {
    DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
               DBName <- names(ahQuery)[1]
} else {
    print("You don't have a valid DB")
    rmarkdown::render() 
} 

AnnoDb <- ah[[DBName]] # Store into an OrgDb object  

# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")

# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="ENSEMBLTRANS",
                 columns="ENSEMBL")

# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##         ENSEMBLTRANS            ENSEMBL
## 1 ENSMUST00000030010 ENSMUSG00000015243
## 2 ENSMUST00000149127 ENSMUSG00000015243
## 3 ENSMUST00000033695 ENSMUSG00000031333
## 4 ENSMUST00000140333 ENSMUSG00000031333
## 5 ENSMUST00000236391 ENSMUSG00000024030
## 6 ENSMUST00000024829 ENSMUSG00000024030

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3)) 

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##            Sample Group                                         Path
## WT-rep1   WT-rep1    WT  salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO salmon_output/cKO-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                        WT-rep1     WT-rep2     WT-rep3    cKO-rep1    cKO-rep2
## ENSMUSG00000000001 6564.991768 5802.833633 5645.878976 5997.157707 6111.402833
## ENSMUSG00000000056 1575.371304 1432.872026 1309.555830 1526.721207 1658.347561
## ENSMUSG00000000058    4.982011    5.051528    4.115651    2.925755    1.002572
## ENSMUSG00000000078 3333.818365 2984.544915 2808.165368 3516.329828 3680.185701
## ENSMUSG00000000088 4490.283446 4141.351728 3882.312160 4167.144948 4205.359513
## ENSMUSG00000000093    9.362193    8.033400    7.795037    7.122818    4.059814
##                       cKO-rep3
## ENSMUSG00000000001 5361.224091
## ENSMUSG00000000056 1574.527535
## ENSMUSG00000000058    2.928238
## ENSMUSG00000000078 2833.866380
## ENSMUSG00000000088 3682.294112
## ENSMUSG00000000093    4.185473
dim(txi_tpm$counts)
## [1] 15022     6
# counts
head(txi_counts$counts)
##                    WT-rep1  WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6492 5822.000    5614 6040.000 6142.000 5437.000
## ENSMUSG00000000056    1637 1458.000    1395 1579.000 1608.999 1398.000
## ENSMUSG00000000058       5    5.000       4    3.000    1.000    3.000
## ENSMUSG00000000078    3344 2807.000    2828 3589.000 3731.000 2912.000
## ENSMUSG00000000088    4558 4149.929    3958 4189.921 4151.000 3623.928
## ENSMUSG00000000093      10    8.000       8    7.000    4.000    4.000
dim(txi_counts$counts)
## [1] 15022     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 15022 6 
## metadata(1): version
## assays(1): counts
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6565    5803    5646     5997     6111     5361
## ENSMUSG00000000056    1575    1433    1310     1527     1658     1575
## ENSMUSG00000000058       5       5       4        3        1        3
## ENSMUSG00000000078    3334    2985    2808     3516     3680     2834
## ENSMUSG00000000088    4490    4141    3882     4167     4205     3682
## ENSMUSG00000000093       9       8       8        7        4        4
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 15022 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## ENSMUSG00000000056    1637    1458    1395     1579     1609     1398
## ENSMUSG00000000058       5       5       4        3        1        3
## ENSMUSG00000000078    3344    2807    2828     3589     3731     2912
## ENSMUSG00000000088    4558    4150    3958     4190     4151     3624
## ENSMUSG00000000093      10       8       8        7        4        4

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 15022 6 
## metadata(1): version
## assays(1): ''
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 15022 6 
## metadata(1): version
## assays(1): ''
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##   WT-rep1   WT-rep2   WT-rep3  cKO-rep1  cKO-rep2  cKO-rep3 
## 1.1096017 1.0008049 0.9706968 1.0240773 1.0397806 0.8919291
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##            Sample    Group                   Path
##          <factor> <factor>            <character>
## WT-rep1  WT-rep1       WT  salmon_output/WT-rep..
## WT-rep2  WT-rep2       WT  salmon_output/WT-rep..
## WT-rep3  WT-rep3       WT  salmon_output/WT-rep..
## cKO-rep1 cKO-rep1      cKO salmon_output/cKO-re..
## cKO-rep2 cKO-rep2      cKO salmon_output/cKO-re..
## cKO-rep3 cKO-rep3      cKO salmon_output/cKO-re..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##            Sample    Group                   Path sizeFactor
##          <factor> <factor>            <character>  <numeric>
## WT-rep1  WT-rep1       WT  salmon_output/WT-rep..   1.109602
## WT-rep2  WT-rep2       WT  salmon_output/WT-rep..   1.000805
## WT-rep3  WT-rep3       WT  salmon_output/WT-rep..   0.970697
## cKO-rep1 cKO-rep1      cKO salmon_output/cKO-re..   1.024077
## cKO-rep2 cKO-rep2      cKO salmon_output/cKO-re..   1.039781
## cKO-rep3 cKO-rep3      cKO salmon_output/cKO-re..   0.891929

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.8, 1.2)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##                 Gene    baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5879.180418    0.017252845 0.05844104  0.2952180
## 2 ENSMUSG00000000056 1508.720447    0.207064498 0.09848660  2.1024636
## 3 ENSMUSG00000000058    3.479592   -0.922194788 1.11921063 -0.8239689
## 4 ENSMUSG00000000078 3171.662271    0.193060324 0.07734540  2.4960801
## 5 ENSMUSG00000000088 4070.772911    0.006760457 0.06174987  0.1094813
## 6 ENSMUSG00000000093    6.585522   -0.680363745 0.80291393 -0.8473682
##       pvalue      padj   FDR Input
## 1 0.76782737 0.9757786 > 0.1   TPM
## 2 0.03551268 0.3063213 > 0.1   TPM
## 3 0.40995721        NA > 0.1   TPM
## 4 0.01255742 0.1648302 > 0.1   TPM
## 5 0.91282074 0.9936226 > 0.1   TPM
## 6 0.39678991        NA > 0.1   TPM
head(all.resList[[1]][['Counts']])
##                 Gene    baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.744236    0.017418051 0.06091612  0.2859350
## 2 ENSMUSG00000000056 1515.414428    0.206534290 0.09949552  2.0758149
## 3 ENSMUSG00000000058    3.500628   -0.965055656 1.12473916 -0.8580262
## 4 ENSMUSG00000000078 3189.467191    0.193219406 0.07898093  2.4464059
## 5 ENSMUSG00000000088 4095.812572    0.006826617 0.06412983  0.1064499
## 6 ENSMUSG00000000093    6.711966   -0.653859194 0.80313407 -0.8141345
##       pvalue      padj   FDR  Input
## 1 0.77492790 0.9770194 > 0.1 Counts
## 2 0.03791107 0.3138926 > 0.1 Counts
## 3 0.39087800        NA > 0.1 Counts
## 4 0.01442885 0.1789632 > 0.1 Counts
## 5 0.91522537 0.9928907 > 0.1 Counts
## 6 0.41556788        NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
##                 Gene    baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5879.180418    0.016861268 0.05711473  0.2952180
## 2 ENSMUSG00000000056 1508.720447    0.194251645 0.09239231  2.1024636
## 3 ENSMUSG00000000058    3.479592   -0.096922367 0.11921763 -0.8239689
## 4 ENSMUSG00000000078 3171.662271    0.185515335 0.07432200  2.4960801
## 5 ENSMUSG00000000088 4070.772911    0.006589589 0.06018941  0.1094813
## 6 ENSMUSG00000000093    6.585522   -0.126602312 0.14998825 -0.8473682
##       pvalue      padj   FDR Input
## 1 0.76782737 0.9757786 > 0.1   TPM
## 2 0.03551268 0.3063213 > 0.1   TPM
## 3 0.40995721        NA > 0.1   TPM
## 4 0.01255742 0.1648302 > 0.1   TPM
## 5 0.91282074 0.9936226 > 0.1   TPM
## 6 0.39678991        NA > 0.1   TPM
head(all.resList[[2]][['Counts']])
##                 Gene    baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.744236    0.016989832 0.05941861  0.2859350
## 2 ENSMUSG00000000056 1515.414428    0.193518482 0.09322728  2.0758149
## 3 ENSMUSG00000000058    3.500628   -0.100648276 0.11891890 -0.8580262
## 4 ENSMUSG00000000078 3189.467191    0.185366747 0.07577029  2.4464059
## 5 ENSMUSG00000000088 4095.812572    0.006641088 0.06238723  0.1064499
## 6 ENSMUSG00000000093    6.711966   -0.121768425 0.15012697 -0.8141345
##       pvalue      padj   FDR  Input
## 1 0.77492790 0.9770194 > 0.1 Counts
## 2 0.03791107 0.3138926 > 0.1 Counts
## 3 0.39087800        NA > 0.1 Counts
## 4 0.01442885 0.1789632 > 0.1 Counts
## 5 0.91522537 0.9928907 > 0.1 Counts
## 6 0.41556788        NA > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-7.5, 7.5)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) + 
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 15022 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 15022 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# Plot distribution of FDR
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-5, 5)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs with the TPM and original count inputs

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList

# Set a function cleaning a data frame 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}

# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (i in shrinkage) {

    for (x in TvC) {

        rankdf <- all.resList[[i]][[x]]

        fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn() 

        lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

        up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% 
            arrange(desc(log2FoldChange)) %>% 
            Ranking.fn()

        down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
            arrange(log2FoldChange) %>%
            Ranking.fn()

    }
}

# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
##                  Gene   baseMean log2FoldChange     lfcSE       stat
## 1: ENSMUSG00000022018   638.2601     -1.2818702 0.1145578 -11.189728
## 2: ENSMUSG00000040498   713.2059     -2.2334532 0.2151601 -10.380425
## 3: ENSMUSG00000032011 16087.6748     -1.9132468 0.1853843 -10.320433
## 4: ENSMUSG00000047180  1002.6988     -1.4158861 0.1397523 -10.131398
## 5: ENSMUSG00000020689  2308.2182     -1.5344663 0.1555018  -9.867836
## 6: ENSMUSG00000024065  1102.5550     -0.9671978 0.1170542  -8.262819
##          pvalue         padj   FDR Input Rank
## 1: 4.578363e-29 2.152288e-25 < 0.1   TPM    1
## 2: 3.044162e-25 7.155302e-22 < 0.1   TPM    2
## 3: 5.696543e-25 8.926483e-22 < 0.1   TPM    3
## 4: 4.008763e-24 4.711299e-21 < 0.1   TPM    4
## 5: 5.738939e-23 5.395750e-20 < 0.1   TPM    5
## 6: 1.422715e-16 1.114697e-13 < 0.1   TPM    6
head(fdr.rank[[1]][[2]])
##                  Gene   baseMean log2FoldChange     lfcSE       stat
## 1: ENSMUSG00000022018   641.9707      -1.280927 0.1159692 -11.045405
## 2: ENSMUSG00000040498   717.1636      -2.233531 0.2135438 -10.459359
## 3: ENSMUSG00000032011 16169.4283      -1.913100 0.1840727 -10.393178
## 4: ENSMUSG00000047180  1008.5805      -1.416004 0.1400262 -10.112417
## 5: ENSMUSG00000020689  2322.4101      -1.534429 0.1547689  -9.914329
## 6: ENSMUSG00000024065  1109.4381      -0.966510 0.1177500  -8.208153
##          pvalue         padj   FDR  Input Rank
## 1: 2.307255e-28 1.044956e-24 < 0.1 Counts    1
## 2: 1.327505e-25 3.006135e-22 < 0.1 Counts    2
## 3: 2.663268e-25 4.020647e-22 < 0.1 Counts    3
## 4: 4.866840e-24 5.510480e-21 < 0.1 Counts    4
## 5: 3.606716e-23 3.266964e-20 < 0.1 Counts    5
## 6: 2.246172e-16 1.695486e-13 < 0.1 Counts    6
head(lfc.rank[[1]][[1]])
##                  Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000078926  9.607045      -4.750698 1.6717195 -2.841804 4.485912e-03
## 2: ENSMUSG00000035606  9.077546      -4.707677 1.3988639 -3.365357 7.644454e-04
## 3: ENSMUSG00000071713  9.742826      -4.186298 1.5134202 -2.766118 5.672806e-03
## 4: ENSMUSG00000087412  9.268108      -4.142177 1.2267197 -3.376629 7.338005e-04
## 5: ENSMUSG00000025929 39.258316      -3.504991 0.7640608 -4.587320 4.489727e-06
## 6: ENSMUSG00000061769 12.344903       3.297253 0.9505086  3.468936 5.225245e-04
##           padj   FDR Input Rank
## 1: 0.083024702 < 0.1   TPM    1
## 2: 0.024281473 < 0.1   TPM    2
## 3: 0.096622684 < 0.1   TPM    3
## 4: 0.023466641 < 0.1   TPM    4
## 5: 0.000570438 < 0.1   TPM    5
## 6: 0.018990240 < 0.1   TPM    6
head(lfc.rank[[1]][[2]])
##                  Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000087412  9.244459      -3.936202 1.1691184 -3.366812 7.604245e-04
## 2: ENSMUSG00000025929 39.611464      -3.516173 0.7543463 -4.661219 3.143427e-06
## 3: ENSMUSG00000061769 12.291486       3.259101 0.9527112  3.420870 6.242122e-04
## 4: ENSMUSG00000043932 14.922102       3.162490 0.8837090  3.578655 3.453674e-04
## 5: ENSMUSG00000037129 21.804492      -2.994600 0.6584157 -4.548190 5.410923e-06
## 6: ENSMUSG00000059136  9.189472       2.688384 0.9096773  2.955316 3.123490e-03
##            padj   FDR  Input Rank
## 1: 0.0244252678 < 0.1 Counts    1
## 2: 0.0003954606 < 0.1 Counts    2
## 3: 0.0207871846 < 0.1 Counts    3
## 4: 0.0140916134 < 0.1 Counts    4
## 5: 0.0006283608 < 0.1 Counts    5
## 6: 0.0670440156 < 0.1 Counts    6
head(up.lfc.rank[[1]][[1]])
##                  Gene baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000061769 12.34490       3.297253 0.9505086 3.468936 0.0005225245
## 2: ENSMUSG00000043932 15.01516       3.211369 0.8834582 3.634999 0.0002779822
## 3: ENSMUSG00000059136 14.52260       2.411968 0.7542152 3.197983 0.0013839223
## 4: ENSMUSG00000082901 12.00487       2.403901 0.7930771 3.031107 0.0024365917
## 5: ENSMUSG00000026241 18.25948       2.241226 0.6781347 3.304987 0.0009498089
## 6: ENSMUSG00000051726 15.38640       2.208510 0.5997559 3.682349 0.0002310946
##          padj   FDR Input Rank
## 1: 0.01899024 < 0.1   TPM    1
## 2: 0.01221303 < 0.1   TPM    2
## 3: 0.03830822 < 0.1   TPM    3
## 4: 0.05727209 < 0.1   TPM    4
## 5: 0.02862212 < 0.1   TPM    5
## 6: 0.01091174 < 0.1   TPM    6
head(up.lfc.rank[[1]][[2]])
##                  Gene  baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000061769 12.291486       3.259101 0.9527112 3.420870 0.0006242122
## 2: ENSMUSG00000043932 14.922102       3.162490 0.8837090 3.578655 0.0003453674
## 3: ENSMUSG00000059136  9.189472       2.688384 0.9096773 2.955316 0.0031234902
## 4: ENSMUSG00000082901 12.087585       2.412037 0.7922746 3.044445 0.0023310977
## 5: ENSMUSG00000026241 18.124144       2.255377 0.6737004 3.347745 0.0008147201
## 6: ENSMUSG00000051726 15.505212       2.228789 0.6016818 3.704265 0.0002120042
##          padj   FDR  Input Rank
## 1: 0.02078718 < 0.1 Counts    1
## 2: 0.01409161 < 0.1 Counts    2
## 3: 0.06704402 < 0.1 Counts    3
## 4: 0.05556601 < 0.1 Counts    4
## 5: 0.02544736 < 0.1 Counts    5
## 6: 0.01000174 < 0.1 Counts    6
head(down.lfc.rank[[1]][[1]])
##                  Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000078926  9.607045      -4.750698 1.6717195 -2.841804 4.485912e-03
## 2: ENSMUSG00000035606  9.077546      -4.707677 1.3988639 -3.365357 7.644454e-04
## 3: ENSMUSG00000071713  9.742826      -4.186298 1.5134202 -2.766118 5.672806e-03
## 4: ENSMUSG00000087412  9.268108      -4.142177 1.2267197 -3.376629 7.338005e-04
## 5: ENSMUSG00000025929 39.258316      -3.504991 0.7640608 -4.587320 4.489727e-06
## 6: ENSMUSG00000047501  7.042870      -3.211922 1.0593659 -3.031929 2.429966e-03
##           padj   FDR Input Rank
## 1: 0.083024702 < 0.1   TPM    1
## 2: 0.024281473 < 0.1   TPM    2
## 3: 0.096622684 < 0.1   TPM    3
## 4: 0.023466641 < 0.1   TPM    4
## 5: 0.000570438 < 0.1   TPM    5
## 6: 0.057272088 < 0.1   TPM    6
head(down.lfc.rank[[1]][[2]])
##                  Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000087412  9.244459      -3.936202 1.1691184 -3.366812 7.604245e-04
## 2: ENSMUSG00000025929 39.611464      -3.516173 0.7543463 -4.661219 3.143427e-06
## 3: ENSMUSG00000037129 21.804492      -2.994600 0.6584157 -4.548190 5.410923e-06
## 4: ENSMUSG00000070368 43.604624      -2.650362 0.6872414 -3.856522 1.150116e-04
## 5: ENSMUSG00000040724 25.325681      -2.519621 0.6750384 -3.732560 1.895438e-04
## 6: ENSMUSG00000030217 25.976658      -2.507083 0.5278218 -4.749866 2.035511e-06
##            padj   FDR  Input Rank
## 1: 0.0244252678 < 0.1 Counts    1
## 2: 0.0003954606 < 0.1 Counts    2
## 3: 0.0006283608 < 0.1 Counts    3
## 4: 0.0066780461 < 0.1 Counts    4
## 5: 0.0093309103 < 0.1 Counts    5
## 6: 0.0002880885 < 0.1 Counts    6

Calculating rank difference

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)], 
              by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Set a function adding Shrinkage column 
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)} 

# Set a function rbinding multiple data frames 
rbinds.fn <- function(List) {rbind(List[[1]], 
                                   List[[2]], 
                                   List[[3]], 
                                   List[[4]])}



# Calculate and plot rank difference
for (i in shrinkage) {

    # Calculate rank difference
    fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
    lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
    up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
    down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i) 

}

# Create combined data frames across the shrinkages 
fdr.rank.df <- rbinds.fn(fdr.rank) 
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)



# Explore the ranking outputs
head(fdr.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000022018     TPM      1   638.2601  Counts      1   641.9707
## 2: ENSMUSG00000040498     TPM      2   713.2059  Counts      2   717.1636
## 3: ENSMUSG00000032011     TPM      3 16087.6748  Counts      3 16169.4283
## 4: ENSMUSG00000047180     TPM      4  1002.6988  Counts      4  1008.5805
## 5: ENSMUSG00000020689     TPM      5  2308.2182  Counts      5  2322.4101
## 6: ENSMUSG00000024065     TPM      6  1102.5550  Counts      6  1109.4381
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          6.866147        0        1      None
## 2:          6.977083        0        2      None
## 3:         10.092966        0        3      None
## 4:          7.317869        0        4      None
## 5:          8.151744        0        5      None
## 6:          7.412929        0        6      None
head(lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926     TPM      1   9.607045    <NA>     NA         NA
## 2: ENSMUSG00000035606     TPM      2   9.077546    <NA>     NA         NA
## 3: ENSMUSG00000071713     TPM      3   9.742826    <NA>     NA         NA
## 4: ENSMUSG00000087412     TPM      4   9.268108  Counts      1   9.244459
## 5: ENSMUSG00000025929     TPM      5  39.258316  Counts      2  39.611464
## 6: ENSMUSG00000061769     TPM      6  12.344903  Counts      3  12.291486
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:                NA       NA       NA      None
## 2:                NA       NA       NA      None
## 3:                NA       NA       NA      None
## 4:          2.631193        3      2.5      None
## 5:          4.078622        3      3.5      None
## 6:          2.917265        3      4.5      None
head(up.lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000061769     TPM      1   12.34490  Counts      1  12.291486
## 2: ENSMUSG00000043932     TPM      2   15.01516  Counts      2  14.922102
## 3: ENSMUSG00000059136     TPM      3   14.52260  Counts      3   9.189472
## 4: ENSMUSG00000082901     TPM      4   12.00487  Counts      4  12.087585
## 5: ENSMUSG00000026241     TPM      5   18.25948  Counts      5  18.124144
## 6: ENSMUSG00000051726     TPM      6   15.38640  Counts      6  15.505212
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          2.917265        0        1      None
## 2:          3.112457        0        2      None
## 3:          2.950596        0        3      None
## 4:          2.893071        0        4      None
## 5:          3.307676        0        5      None
## 6:          3.141520        0        6      None
head(down.lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926     TPM      1   9.607045    <NA>     NA         NA
## 2: ENSMUSG00000035606     TPM      2   9.077546    <NA>     NA         NA
## 3: ENSMUSG00000071713     TPM      3   9.742826    <NA>     NA         NA
## 4: ENSMUSG00000087412     TPM      4   9.268108  Counts      1   9.244459
## 5: ENSMUSG00000025929     TPM      5  39.258316  Counts      2  39.611464
## 6: ENSMUSG00000047501     TPM      6   7.042870    <NA>     NA         NA
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:                NA       NA       NA      None
## 2:                NA       NA       NA      None
## 3:                NA       NA       NA      None
## 4:          2.631193        3      2.5      None
## 5:          4.078622        3      3.5      None
## 6:                NA       NA       NA      None

Visualizing DEG ranks I: TPM- vs Counts-input

- x-axis: rank with TPM input

- y-axis: rank with Counts input

- Black diagonal lines: rank with TPM = rank with Counts

- Dot color: gene expression level (log-baseMean)

# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")

ranking.plot.fn(lfc.rank.df, "Log2FoldChange")

ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (rank with TPM - rank with Counts)

- Black horizontal lines: rank with TPM = rank with Counts

- Dot color: average rank

# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        facet_grid(~Shrinkage) + 
        theme(strip.text.x=element_text(size=10)) + 
        ylab("Rank Difference (TPM - Count)") + 
        ggtitle(paste("Rank Difference Inputs (TPM - Count):", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
        ylim(-ylim, ylim)
}

# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 150)

rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 200)

rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 150)

rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 160)

Distribution of rank difference in unshrunken data

- y-axis: abs(TPM-Count inputs)

- x-axis: FDR or log2FoldChange (Increase/Decrease)

- red dashed line: Rank difference = 25

- blue dashed line: Rank difference = 10

# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}

# Create a list storing rankdiff data frames 
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
                 unshr.rankdiff.fn(lfc.rank.df),
                 unshr.rankdiff.fn(up.lfc.rank.df),
                 unshr.rankdiff.fn(down.lfc.rank.df))


# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")


# Name the list 
names(rankList) <- rankdiff.levels

# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage") + 
ylab("Absolute Rank Difference") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

- 16 genes were missing in the plots

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(ENSEMBL) %>% 
    summarize(num.trans=n())

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by=c("Gene"="ENSEMBL"))}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
head(rankList[[1]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000022018     TPM      1   638.2601  Counts      1   641.9707
## 2: ENSMUSG00000040498     TPM      2   713.2059  Counts      2   717.1636
## 3: ENSMUSG00000032011     TPM      3 16087.6748  Counts      3 16169.4283
## 4: ENSMUSG00000047180     TPM      4  1002.6988  Counts      4  1008.5805
## 5: ENSMUSG00000020689     TPM      5  2308.2182  Counts      5  2322.4101
## 6: ENSMUSG00000024065     TPM      6  1102.5550  Counts      6  1109.4381
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          6.866147        0        1      None         1
## 2:          6.977083        0        2      None         4
## 3:         10.092966        0        3      None         6
## 4:          7.317869        0        4      None         2
## 5:          8.151744        0        5      None         2
## 6:          7.412929        0        6      None         2
head(rankList[[2]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926     TPM      1   9.607045    <NA>     NA         NA
## 2: ENSMUSG00000035606     TPM      2   9.077546    <NA>     NA         NA
## 3: ENSMUSG00000071713     TPM      3   9.742826    <NA>     NA         NA
## 4: ENSMUSG00000087412     TPM      4   9.268108  Counts      1   9.244459
## 5: ENSMUSG00000025929     TPM      5  39.258316  Counts      2  39.611464
## 6: ENSMUSG00000061769     TPM      6  12.344903  Counts      3  12.291486
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:                NA       NA       NA      None         5
## 2:                NA       NA       NA      None         2
## 3:                NA       NA       NA      None         8
## 4:          2.631193        3      2.5      None         2
## 5:          4.078622        3      3.5      None         1
## 6:          2.917265        3      4.5      None         1
head(rankList[[3]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000061769     TPM      1   12.34490  Counts      1  12.291486
## 2: ENSMUSG00000043932     TPM      2   15.01516  Counts      2  14.922102
## 3: ENSMUSG00000059136     TPM      3   14.52260  Counts      3   9.189472
## 4: ENSMUSG00000082901     TPM      4   12.00487  Counts      4  12.087585
## 5: ENSMUSG00000026241     TPM      5   18.25948  Counts      5  18.124144
## 6: ENSMUSG00000051726     TPM      6   15.38640  Counts      6  15.505212
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          2.917265        0        1      None         1
## 2:          3.112457        0        2      None         1
## 3:          2.950596        0        3      None         4
## 4:          2.893071        0        4      None         1
## 5:          3.307676        0        5      None         1
## 6:          3.141520        0        6      None         1
head(rankList[[4]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926     TPM      1   9.607045    <NA>     NA         NA
## 2: ENSMUSG00000035606     TPM      2   9.077546    <NA>     NA         NA
## 3: ENSMUSG00000071713     TPM      3   9.742826    <NA>     NA         NA
## 4: ENSMUSG00000087412     TPM      4   9.268108  Counts      1   9.244459
## 5: ENSMUSG00000025929     TPM      5  39.258316  Counts      2  39.611464
## 6: ENSMUSG00000047501     TPM      6   7.042870    <NA>     NA         NA
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:                NA       NA       NA      None         5
## 2:                NA       NA       NA      None         2
## 3:                NA       NA       NA      None         8
## 4:          2.631193        3      2.5      None         2
## 5:          4.078622        3      3.5      None         1
## 6:                NA       NA       NA      None         1
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Transcripts in", title)) + 
        xlab("Number of Transcripts") + 
        ylab("Absolute Rank Difference (TPM - Counts)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) 

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 34 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1]  3 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1]  0 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1]  0 12

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    df <- df %>% 

        # Filter genes with valid padj 
        filter(!is.na(padj)) %>% 
        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.52.0        ashr_2.2-47                
##  [3] apeglm_1.12.0               UpSetR_1.4.0               
##  [5] gridExtra_2.3               pheatmap_1.0.12            
##  [7] DESeq2_1.30.0               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [11] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.2         IRanges_2.24.0             
## [15] S4Vectors_0.28.1            tximport_1.18.0            
## [17] forcats_0.5.0               stringr_1.4.0              
## [19] dplyr_1.0.2                 purrr_0.3.4                
## [21] readr_1.4.0                 tidyr_1.1.2                
## [23] tibble_3.0.4                ggplot2_3.3.2              
## [25] tidyverse_1.3.0             AnnotationHub_2.22.0       
## [27] BiocFileCache_1.14.0        dbplyr_2.0.0               
## [29] BiocGenerics_0.36.0         rmarkdown_2.5              
## [31] data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0              ellipsis_0.3.1               
##  [3] XVector_0.30.0                fs_1.5.0                     
##  [5] rstudioapi_0.13               farver_2.0.3                 
##  [7] bit64_4.0.5                   mvtnorm_1.1-1                
##  [9] interactiveDisplayBase_1.28.0 fansi_0.4.1                  
## [11] lubridate_1.7.9.2             xml2_1.3.2                   
## [13] splines_4.0.3                 geneplotter_1.68.0           
## [15] knitr_1.30                    jsonlite_1.7.2               
## [17] broom_0.7.2                   annotate_1.68.0              
## [19] shiny_1.5.0                   BiocManager_1.30.10          
## [21] compiler_4.0.3                httr_1.4.2                   
## [23] backports_1.2.1               assertthat_0.2.1             
## [25] Matrix_1.2-18                 fastmap_1.0.1                
## [27] cli_2.2.0                     later_1.1.0.1                
## [29] htmltools_0.5.0               tools_4.0.3                  
## [31] coda_0.19-4                   gtable_0.3.0                 
## [33] glue_1.4.2                    GenomeInfoDbData_1.2.4       
## [35] rappdirs_0.3.1                Rcpp_1.0.5                   
## [37] bbmle_1.0.23.1                cellranger_1.1.0             
## [39] vctrs_0.3.5                   xfun_0.19                    
## [41] ps_1.5.0                      rvest_0.3.6                  
## [43] irlba_2.3.3                   mime_0.9                     
## [45] lifecycle_0.2.0               XML_3.99-0.5                 
## [47] MASS_7.3-53                   zlibbioc_1.36.0              
## [49] scales_1.1.1                  hms_0.5.3                    
## [51] promises_1.1.1                RColorBrewer_1.1-2           
## [53] yaml_2.2.1                    curl_4.3                     
## [55] memoise_1.1.0                 emdbook_1.3.12               
## [57] bdsmatrix_1.3-4               SQUAREM_2020.5               
## [59] stringi_1.5.3                 RSQLite_2.2.1                
## [61] BiocVersion_3.12.0            genefilter_1.72.0            
## [63] BiocParallel_1.24.1           truncnorm_1.0-8              
## [65] rlang_0.4.9                   pkgconfig_2.0.3              
## [67] bitops_1.0-6                  invgamma_1.1                 
## [69] evaluate_0.14                 lattice_0.20-41              
## [71] labeling_0.4.2                bit_4.0.4                    
## [73] tidyselect_1.1.0              plyr_1.8.6                   
## [75] magrittr_2.0.1                R6_2.5.0                     
## [77] generics_0.1.0                DelayedArray_0.16.0          
## [79] DBI_1.1.0                     pillar_1.4.7                 
## [81] haven_2.3.1                   withr_2.3.0                  
## [83] mixsqp_0.3-43                 survival_3.2-7               
## [85] RCurl_1.98-1.2                modelr_0.1.8                 
## [87] crayon_1.3.4                  locfit_1.5-9.4               
## [89] grid_4.0.3                    readxl_1.3.1                 
## [91] blob_1.2.1                    reprex_0.3.0                 
## [93] digest_0.6.27                 xtable_1.8-4                 
## [95] numDeriv_2016.8-1.1           httpuv_1.5.4                 
## [97] munsell_0.5.0